
# Launch a vLLM server for Qwen3-235B-A22B.
#
# Requirements: bash (arrays, $(( )) arithmetic) and `vllm` on PATH.
# The script's exit status is that of `vllm serve`.

# --- Environment read by PyTorch / vLLM in the server process ---------------
# PyTorch CUDA caching-allocator option (expandable segments).
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Very large timeouts: 36000 s and 36000000 ms are both 10 hours.
# NOTE(review): units taken from the vLLM env-var docs (seconds for the engine
# iteration timeout, milliseconds for the RPC timeout) -- confirm for the
# installed vLLM version.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=36000
export VLLM_RPC_TIMEOUT=36000000
# NOTE(review): not a variable documented by upstream vLLM as far as this
# review could tell; presumably read by a vendor build -- confirm.
export VLLM_ENFORCE_CUDA_GRAPH=0

# --- Server configuration ----------------------------------------------------
model_path=/home/weights/Qwen/Qwen3-235B-A22B
# 16 Ki tokens. Uses $(( )) instead of the deprecated $[ ] arithmetic form.
max_model_len=$((16 * 1024))

# Arguments are kept in an array so each flag can sit on its own line; the
# resulting argument vector is the same as the original one-line command.
serve_args=(
  "$model_path"
  --served-model-name Qwen3-235B-A22B
  # Parallel layout: tensor-parallel 8 x pipeline-parallel 2.
  -tp 8
  -pp 2
  --max-model-len "$max_model_len"
  --dtype bfloat16
  --gpu-memory-utilization 0.95
  --max-num-seqs 128
  --generation-config vllm
  --enforce-eager
  --enable-chunked-prefill
  # NOTE(review): newer vLLM releases may treat --enable-reasoning as
  # deprecated in favour of --reasoning-parser alone -- confirm before removing.
  --enable-reasoning
  --reasoning-parser qwen3
  # Listen on all interfaces, port 8000.
  --host 0.0.0.0
  --port 8000
)

vllm serve "${serve_args[@]}"

# Alternative layout (disabled): same server, but 16-way tensor parallelism with no pipeline parallelism (-tp 16 -pp 1).
# vllm serve /home/weights/Qwen/Qwen3-235B-A22B --served-model-name Qwen3-235B-A22B -tp 16 -pp 1 --max-model-len $((16*1024)) --dtype bfloat16 --gpu-memory-utilization 0.95 --max-num-seqs 128 --generation-config vllm --enforce-eager --enable-chunked-prefill --enable-reasoning --reasoning-parser qwen3  --host 0.0.0.0 --port 8000